import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from faiq_utils import plot_wordcloud, barplot_cvec, \
stem_text
# Pandas display settings: show wide/long frames in full when inspecting data.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', None)
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
# download stopwords for NLTK
nltk.download('stopwords')
# Load the raw datasets; parse txn_date up front so date-based features work later.
df_transaction = pd.read_csv('../data/bank_transaction.csv', parse_dates=['txn_date'])
df_user = pd.read_csv('../data/user_profile.csv')
# Quick peek at the transaction data.
df_transaction.head(3)
Here, we notice several missing values in the category column. This will prompt further investigation below.
# Inspect dtypes and null counts for both datasets.
df_transaction.info()
df_user.head(3)
df_user.info()
bank_transaction.csv — category & amount column distribution. We find 0.1% of transactions aren't classified into any category, and they come from 2 clients with IDs 880 and 788. When we look into the amounts being transacted, they fall within $200, though half of them are money coming in. This is in stark contrast to transactions with non-null categories, where the majority of money is going out.
More data will be required to ascertain why these clients in particular are facing this issue. Hence, in our preprocessing, we'll choose to drop these transactions, especially since they account for such a small amount.
# Preview transactions whose category is missing.
null_category_mask = df_transaction['category'].isna()
df_transaction[null_category_mask].head(10)
# Share of rows with a missing 'category', as a percentage of all rows.
null_percentage = 100 * null_category_mask.sum() / len(df_transaction)
print(f"Percentage of null values in 'category': {null_percentage:.2f}%")
# Which clients do the null-category transactions belong to?
df_transaction[null_category_mask]['client_id'].value_counts()
# Cross-reference the two affected clients in the user profile data.
df_user.loc[df_user['CLIENT_ID'].isin([880, 788])]
# These clients also have plenty of NON-null transactions, so the issue is not client-wide.
df_transaction.loc[(df_transaction['client_id'] == 880) & (~df_transaction['category'].isnull())].head(10)
df_transaction.loc[(df_transaction['client_id'] == 788) & (~df_transaction['category'].isnull())].head(10)
# Check whether the null categories concentrate in a particular bank.
df_transaction.loc[df_transaction['category'].isnull()]['bank_id'].value_counts()
The descriptions don't point to anything significant either for why the category column is null.
# Inspect the raw descriptions of the null-category transactions for clues.
df_transaction.loc[df_transaction['category'].isnull()]['description'].value_counts()
# Distribution of the 'amount' column for transactions with a null category.
null_amounts = df_transaction.loc[df_transaction['category'].isnull(), 'amount']
plt.figure(figsize=(10, 6))
sns.histplot(null_amounts, bins=10, color='skyblue', edgecolor='black')
plt.title("Distribution of Transaction Amounts (Null Categories)", fontsize=16, weight='bold')
plt.xlabel("Amount", fontsize=14)
plt.ylabel("Frequency", fontsize=14)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
# Summary statistics and distribution of amounts for categorised transactions.
categorised_amounts = df_transaction.loc[df_transaction['category'].notnull(), 'amount']
categorised_amounts.describe()
plt.figure(figsize=(10, 6))
sns.histplot(categorised_amounts, bins=20, color='red', edgecolor='black')
plt.title("Distribution of Transaction Amounts (Non-Null Categories)", fontsize=16, weight='bold')
plt.xlabel("Amount", fontsize=14)
plt.ylabel("Frequency", fontsize=14)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
With the analysis above, we decide to drop rows with null values in the category column.
# Drop the 0.1% of transactions whose category (our target) is missing.
df_transaction = df_transaction.dropna(subset=['category'])
With only 880 clients (out of 1000) making transactions, there is an opportunity to reach out to these clients to get them to be more active.
# Basic cardinality checks on the cleaned transaction data.
print(f"No. of clients with transactions: {df_transaction['client_id'].nunique()}")
print(f"No. of banks: {df_transaction['bank_id'].nunique()}")
print(f"No. of accounts: {df_transaction['account_id'].nunique()}")
print(f"No. of transactions: {df_transaction['txn_id'].nunique()}")
From the bar chart below, we notice that there's a seasonal pattern for transaction volume within a given week. For example, the number of transactions is highest on Mondays, followed by Fridays. It is at its lowest on the weekends.
When building our classifier, perhaps there are different types of transactions being done over weekdays versus weekends, which may be a very useful feature when developing our model. We'll explore this below when we compare the top n-grams that appear between the different days.
# Number of transactions per calendar date, in chronological order.
daily_txn_counts = df_transaction['txn_date'].dt.date.value_counts().sort_index()
# Plot the distribution over time.
plt.figure(figsize=(30, 6))
daily_txn_counts.plot(kind='bar', color='skyblue')
plt.title("Distribution of Transaction Dates", fontsize=14)
plt.xlabel("Transaction Date", fontsize=12)
plt.ylabel("Number of Transactions", fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.tight_layout()
plt.show()
For pre-processing, we aim to drop rows with no meaningful words — that is, rows whose descriptions consist only of stopwords and non-alphanumeric tokens.
To do this, I'll create WordClouds to identify the top words among the 7 most sizeable categories (due to them accounting for 60% of transactions) and look out for transactions that aren't indicative of intent in their description.
From our analysis, we find the following:
Once we've identified our stopwords, we pre-process our datasets with the following steps to generate a description_stem column for n-gram analysis in a later section.
# Category shares — the 7 largest categories cover roughly 60% of transactions.
df_transaction['category'].value_counts(normalize=True)
sizeable_categories = [
    'Uncategorized',
    'Third Party',
    'Restaurants',
    'Transfer Credit',
    'Loans',
    'Convenience Stores',
    'Supermarkets and Groceries',
]
# Word cloud of the most common description words for each sizeable category.
for category in sizeable_categories:
    plot_wordcloud(
        category=category,
        title=f'Most Frequently Used Words among the {category} Category',
        df=df_transaction,
        text_col_in_df='description',
        max_words=500,
    )
# Inspect transactions mentioning 'Maryse'/'Hemant' and their category mix —
# candidates for custom stopwords if they are spread across categories.
df_transaction.loc[df_transaction['description'].str.contains('Maryse|Maryse Hemant|Hemant')]
df_transaction.loc[df_transaction['description'].str.contains('Maryse|Maryse Hemant|Hemant')].category.value_counts(normalize=True)
# Same check for descriptions containing 'CHECK111'.
df_transaction.loc[df_transaction['description'].str.contains('CHECK111')]
df_transaction.loc[df_transaction['description'].str.contains('CHECK111')].category.value_counts(normalize=True)
# And for 'MoneyLion', for comparison.
df_transaction.loc[df_transaction['description'].str.contains('MoneyLion')]
df_transaction.loc[df_transaction['description'].str.contains('MoneyLion')].category.value_counts(normalize=True)
We begin our pre-processing step below. We use a helper function stem_text() to do this, as well as remove transactions that are only comprised of 'CHECK111'. The stem_text() function performs the following:
# Drop rows whose description is exactly 'CHECK111' (no meaningful intent).
# BUG FIX: the original expression was `~df_transaction['description'] == 'CHECK111'`.
# Unary `~` binds tighter than `==`, so that applied `~` to a string Series and
# raised TypeError at runtime. `!=` expresses the intended row filter.
df_transaction = df_transaction.loc[df_transaction['description'] != 'CHECK111']
# Custom stopwords identified from the word-cloud inspection above.
stopwords_list = ['maryse', 'maryse hemant', 'hemant']
# Pre-process the raw text for uni-gram/bi-gram analysis
df_transaction['description_stem'] = df_transaction['description'].map(lambda x: stem_text(raw_text=x, stopwords_list=stopwords_list))
df_transaction.head(3)
# Reindex the dataframe
df_transaction.reset_index(drop=True, inplace=True)
# Spot-check one raw description against its stemmed counterpart.
df_transaction.loc[3]['description']
df_transaction.loc[3]['description_stem']
As stated in the previous section, for pre-processing we aim to drop rows with no meaningful words — i.e., rows whose stemmed description is empty.
# Count the meaningful words remaining in each transaction after stemming.
df_transaction['description_clean_len'] = df_transaction['description_stem'].str.split().map(len)
no_meaningful_words = df_transaction['description_clean_len'] == 0
# There are 8 transactions that do not have any meaningful words
df_transaction[no_meaningful_words].shape
df_transaction[no_meaningful_words]
# Remove those transactions and restore a contiguous index.
df_transaction = df_transaction[~no_meaningful_words]
df_transaction.reset_index(drop=True, inplace=True)
We noticed earlier that there was a difference in transaction frequency across time, e.g., the no. of transactions on weekends was significantly lower in comparison to weekdays (especially Mondays and Fridays). Hence, it may be useful to compare n-grams of transactions that occur in the different timeframes. Unfortunately, due to compute constraints, we're unable to create n-grams for the entire dataframe. Hence, we'll sample these transactions for simplicity.
# Top n-grams for Monday transactions (full data, no sampling).
# Idiom fix: the title strings had `f` prefixes but no placeholders (ruff F541);
# plain string literals produce the same values.
barplot_cvec(
    df=df_transaction[df_transaction["txn_date"].dt.day_name().isin(['Monday'])],
    titles=['Top 20 Uni-grams - Monday', 'Top 20 Bi-grams - Monday', 'Top 20 Tri-grams - Monday'],
    color='#CB4335',
    xlimit=(0, 2500)
)
We notice a lot more spending in purchases/purchase authorized (which are normally associated with Restaurants/SuperMarkets and Groceries/Convenience Stores transactions) being made on Mondays. This clues us in that maybe most customers prefer to do their groceries on Mondays. Debit Card Purchases being highest on Monday (in terms of 3-grams) also supports this. This in comparison to the weekend where a lot of cash transfers are being made (from the 3-grams).
Hence, for feature engineering, it may be wise to include a feature detailing what day of the week the transaction is being made on.
# Same Monday n-gram plots on a reproducible 20% sample (compute constraints).
monday_sample = df_transaction[df_transaction["txn_date"].dt.day_name().isin(['Monday'])].sample(frac=0.2, random_state=42)
barplot_cvec(
    df=monday_sample,
    titles=[f'Top 20 {gram} - Monday' for gram in ('Uni-grams', 'Bi-grams', 'Tri-grams')],
    color='#CB4335',
    xlimit=(0, 2500)
)
# Top n-grams for weekend transactions, for comparison with weekdays.
weekend_days = ["Sunday", "Saturday"]
weekend_txns = df_transaction[df_transaction["txn_date"].dt.day_name().isin(weekend_days)]
barplot_cvec(
    df=weekend_txns,
    titles=[f'Top 20 {gram} - Weekends' for gram in ('Uni-grams', 'Bi-grams', 'Tri-grams')],
    color='#CB4335',
    xlimit=(0, 2500)
)
# The 7 most sizeable categories (roughly 60% of all transactions).
sizeable_categories = [
    'Uncategorized',
    'Third Party',
    'Restaurants',
    'Transfer Credit',
    'Loans',
    'Convenience Stores',
    'Supermarkets and Groceries',
]
# Plot the uni-grams, bi-grams and tri-grams for each sizeable category.
for category in sizeable_categories:
    barplot_cvec(
        df=df_transaction,
        category=category,
        titles=[f'Top 20 {gram} - {category}' for gram in ('Uni-grams', 'Bi-grams', 'Tri-grams')],
        color='#CB4335',
        xlimit=(0, 2500)
    )
# user_profile.csv — distribution of each interest flag across clients.
# BUG FIXES: (1) the original first line had a markdown header fused onto the
# `for` statement ("user_profile.csv¶for ..."), which is a syntax error;
# (2) the plot title was hard-coded to "... Interested in Investment" for every
# column — derive it from the column being plotted instead.
for x in df_user.columns:
    if x == 'CLIENT_ID':
        continue
    # Human-readable name, e.g. 'IS_INTERESTED_INVESTMENT' -> 'Investment'.
    # .strip() trims the leading space left by replacing the '_' separator.
    label = x.replace("IS_INTERESTED", "").replace("_", " ").title().strip()
    plt.figure(figsize=(8, 5))
    df_user.groupby(x)["CLIENT_ID"].count().plot(kind="bar", color=["skyblue", "orange"], edgecolor="black")
    plt.xlabel(f'Is Interested in {label}')
    plt.ylabel("Number of Clients")
    plt.title(f'Distribution of Clients Interested in {label}')
    # Interest columns are boolean flags, so the two bars are False/True.
    plt.xticks([0, 1], ["False", "True"], rotation=0)
    plt.show()
# All interest-flag columns (everything except the client identifier).
cat_cols = [x for x in df_user.columns if x != 'CLIENT_ID']
# Number of interests each user has ticked; hoisted out of the loop since it
# is loop-invariant.
interest_counts = df_user[cat_cols].sum(axis=1)
# Idiom fix: iterate 1..6 directly instead of `range(7)` with `continue` on 0.
for i in range(1, 7):
    total_users = interest_counts.ge(i).sum()
    # Use the actual user count rather than the hard-coded 1000, so the
    # percentage stays correct if the dataset changes size.
    print(f'No. of users with {i} or more interests: {total_users} ({100 * total_users / len(df_user):.2f}%)')
# Per-client monthly average number of transactions.
user_transaction_count = df_transaction.groupby('client_id').agg(
    # we divide by 3 since our data is from June 1st to 30th September
    # NOTE(review): June 1st–September 30th spans FOUR months (Jun–Sep);
    # if that date range is accurate, the divisor should be 4 — confirm
    # the actual span of the data before trusting these averages.
    monthly_average_transaction_count=('client_id', lambda x: round(x.count() / 3))
).reset_index()
# Display the per-client monthly averages.
user_transaction_count
# Histogram of per-client monthly average transaction counts.
monthly_counts = user_transaction_count["monthly_average_transaction_count"]
plt.figure(figsize=(8, 5))
plt.hist(monthly_counts, bins=5, color='skyblue', edgecolor='black', alpha=0.7)
plt.xlabel("Monthly Average Transaction Count")
plt.ylabel("Frequency")
plt.title("Histogram of Monthly Average Transaction Counts")
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
We consolidate all our feature engineering and data cleaning steps in the below function, clean_split_dataset(). We break them down step-by-step:
- Drop rows with a null category column from bank_transaction.csv.
- Clean descriptions with the stem_text() function, which performs: non-letter removal, case normalization, tokenization, stopword removal, and stemming. We do this to identify transactions comprised of only stop words and non-alphanumerics (which we then remove as they are not meaningful).
- From user_profile.csv, we adopt the interest columns in spite of their sparsity, for experimentation.
- 20% of the original dataset will be set aside and used as a test set. This will be useful in evaluating our model's performance on unseen data. We will also use stratification to preserve the class representation in our train and test set.
def clean_split_dataset():
    """Pre-processing and Feature Engineering function.

    Loads the raw transaction and user CSVs, cleans the transaction text,
    engineers transaction-level (day-of-week flags) and user-level
    (monthly transaction counts per category, interest flags) features,
    then returns a stratified 80/20 train/test split. The splits are also
    written to ../data/train.csv and ../data/test.csv.

    Returns:
        tuple: (train, test) pandas DataFrames, each reindexed from 0 and
        containing the 'category' target column.
    """
    df_transaction = pd.read_csv('../data/bank_transaction.csv', parse_dates=['txn_date'])
    df_user = pd.read_csv('../data/user_profile.csv')
    ##### Transaction-level features #####
    # Drop rows with a missing category (the target variable).
    df_transaction = df_transaction.loc[~df_transaction['category'].isnull()]
    # NOTE(review): the exploratory cell dropped rows EQUAL to 'CHECK111',
    # whereas this drops any description CONTAINING it — confirm which is intended.
    df_transaction = df_transaction.loc[~df_transaction['description'].str.contains('CHECK111')]
    # Pre-process the raw text to remove stopwords
    stopwords_list = ['maryse', 'maryse hemant', 'hemant']
    df_transaction['description_stem'] = df_transaction['description'].map(lambda x: stem_text(raw_text=x, stopwords_list=stopwords_list))
    # Find the number of meaningful words in each transaction
    df_transaction['description_clean_len'] = df_transaction['description_stem'].str.split().map(len)
    # Drop the transactions that do not have any meaningful words e.g., comprised of stopwords and numbers only
    df_transaction = df_transaction.drop(df_transaction[df_transaction['description_clean_len']==0].index)
    # Since we identified that the number of transactions differ between days of the week, we'll create features corresponding
    # to the day of the transaction
    df_transaction['day_monday'] = np.where(df_transaction["txn_date"].dt.day_name() == 'Monday', 1, 0)
    df_transaction['day_friday'] = np.where(df_transaction["txn_date"].dt.day_name() == 'Friday', 1, 0)
    df_transaction['day_weekend'] = np.where(df_transaction["txn_date"].dt.day_name().isin(['Saturday', 'Sunday']), 1, 0)
    ##### User-level features #####
    categories = list(df_transaction.category.unique())
    # Per-client monthly transaction counts: one overall column plus one per category.
    # NOTE(review): the /3 assumes a 3-month window, but an earlier comment in
    # this file says the data spans June 1st–September 30th (4 months) — confirm.
    user_transaction_count = df_transaction.groupby('client_id').agg(
        monthly_transaction_count=('client_id', lambda x: round(x.count() / 3)),
        **{
            # `cat=category` binds the loop variable at lambda-definition time,
            # avoiding the classic late-binding closure pitfall.
            f"monthly_transaction_count_{str(category).lower().replace(' ', '_')}": (
                'category',
                lambda x, cat=category: round((x == cat).sum() / 3)
            )
            for category in categories
        }
    ).reset_index()
    # Attach the user interest flags (cast to int so they join as 0/1 features).
    user_features = user_transaction_count.merge(df_user.astype(int), how='left', left_on='client_id', right_on='CLIENT_ID')
    user_features = user_features.drop(columns=['CLIENT_ID'])
    ##### Join User & Transaction-level features together #####
    df_final = df_transaction.merge(user_features, how='left', on='client_id')
    ##### Train-test Split
    # As we would like to stratify our target variable, we will need to first assign X and y
    X = df_final[[cols for cols in df_final.columns if cols != 'category']]
    y = df_final['category']
    # Perform a train_test_split to create a train and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    # Merge X_train and y_train back together using index
    train = pd.merge(X_train, y_train, left_index=True, right_index=True)
    # Merge X_test and y_test back together using index
    test = pd.merge(X_test, y_test, left_index=True, right_index=True)
    # Reindex the train and test set
    train.reset_index(drop=True, inplace=True)
    test.reset_index(drop=True, inplace=True)
    # Sanity check: stratification should preserve every category in both splits.
    print(f'No. of unique categories in Train: {train.category.nunique()}')
    print(f'No. of unique categories in Test: {test.category.nunique()}')
    print("")
    print(f'Train size: {train.shape}')
    print(f'Test size: {test.shape}')
    # Save clean training set
    train.to_csv('../data/train.csv', index=False)
    test.to_csv('../data/test.csv', index=False)
    print("")
    print('Finished preparing datasets!')
    return (train, test)
# Build and persist the final train/test datasets.
train, test = clean_split_dataset()
train.head(3)
Let's do one final check on null values!
# Confirm there are no remaining null values in either split.
train.info()
test.info()